{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#  1. Prepare data\n",
    "* read from redis\n",
    "* parse the title and abstract\n",
    "* calculate the term frequency and document frequency\n",
    "* build the character-to-index and index-to-character dictionaries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import logging\n",
    "import os\n",
    "import pickle\n",
    "import random\n",
    "import sys\n",
    "\n",
    "import h5py\n",
    "import numpy as np\n",
    "import redis\n",
    "import tensorflow as tf\n",
    "from tensorflow.contrib import rnn\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class Char:\n",
    "    def __init__(self,val,tf,df):\n",
    "        self.val = val\n",
    "        self.tf = tf\n",
    "        self.df = df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def parse_all_crawled_data():\n",
    "    \"\"\"Load every crawled document from Redis.\n",
    "\n",
    "    Reads all keys through the module-level Redis client `r`. Each stored\n",
    "    value is parsed as a JSON object from which \"group_id\", \"title\" and\n",
    "    \"abstract\" are read; tabs in the title and abstract become spaces.\n",
    "\n",
    "    Returns:\n",
    "        list of (group_id, title, abstract) tuples, one per value that was\n",
    "        still present when fetched. Empty list when Redis holds no keys.\n",
    "    \"\"\"\n",
    "    keys = r.keys()\n",
    "    print(\"Get [%s] Docs\" % (len(keys)))\n",
    "    if not keys:\n",
    "        # MGET without any key is rejected by the Redis server.\n",
    "        return []\n",
    "\n",
    "    res = []\n",
    "    for raw in r.mget(keys):\n",
    "        if raw is None:\n",
    "            # The key expired or was deleted between KEYS and MGET.\n",
    "            continue\n",
    "        data = json.loads(raw)\n",
    "        key = data.get(\"group_id\")\n",
    "        title = data.get(\"title\").replace('\\t', ' ')\n",
    "        abstract = data.get(\"abstract\").replace('\\t', ' ')\n",
    "        res.append((key, title, abstract))\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def cal_char_tf_df(corpus):\n",
    "    \"\"\"Count term and document frequency for every character in `corpus`.\n",
    "\n",
    "    Args:\n",
    "        corpus: iterable of records whose items 1 and 2 are the title and\n",
    "            abstract strings.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping each lower-cased character to its `Char` entry.\n",
    "    \"\"\"\n",
    "    stats = {}\n",
    "    for record in corpus:\n",
    "        # Title and abstract are concatenated and lower-cased before counting.\n",
    "        content = (record[1] + record[2]).lower()\n",
    "        # Term frequency: one increment per occurrence.\n",
    "        for ch in content:\n",
    "            entry = stats.get(ch)\n",
    "            if entry:\n",
    "                entry.tf += 1\n",
    "            else:\n",
    "                stats[ch] = Char(val=ch, tf=1, df=0)\n",
    "        # Document frequency: one increment per distinct character per record.\n",
    "        for ch in set(content):\n",
    "            stats[ch].df += 1\n",
    "    return stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def build_idx_for_chars_tf_df(chars,tf_thres = 12,df_thres = 6):\n",
    "    id_beg = 0\n",
    "    id_eos = 1\n",
    "    id_emp = 2\n",
    "    id_unk = 3\n",
    "    \n",
    "    start_idx = id_unk + 1\n",
    "\n",
    "    char2idx = {}\n",
    "    idx2char = {}\n",
    "\n",
    "    char2idx['<eos>'] = id_eos\n",
    "    char2idx['<unk>'] = id_unk\n",
    "    char2idx['<emp>'] = id_emp\n",
    "    char2idx['<beg>'] = id_beg\n",
    "    #filter out tf>20 and df > 10 terms\n",
    "    chars = filter(lambda char:char.tf > tf_thres and char.df > df_thres,chars)\n",
    "    char2idx.update(dict([(char.val,start_idx + idx) for idx,char in enumerate(chars)]))\n",
    "    idx2char = dict([(idx,char) for char,idx in char2idx.items()])\n",
    "    return char2idx, idx2char\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def prt(label, x):\n",
    "    \"\"\"Print `label:` followed by the characters of the index sequence `x`.\n",
    "\n",
    "    Entries equal to the module-level `id_emp` are skipped; every other\n",
    "    entry is looked up in the module-level `idx2char` table. Items are\n",
    "    separated by single spaces and the line ends with a newline.\n",
    "    \"\"\"\n",
    "    print label+':',\n",
    "    visible = (idx2char[w] for w in x if w != id_emp)\n",
    "    for ch in visible:\n",
    "        print ch,\n",
    "    print"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the cached dataset (or rebuild it from Redis) and print a random sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "use the stored data\n",
      "vocabsize is :[3525]\n",
      "H: 白 百 何 羽 凡 感 情 破 裂 豪 宅 被 曝 ， 情 虽 断 但 这 家 还 在 装 修 没 得 说 ！\n",
      "D: 在 白 百 合 出 轨 事 前 ， 她 和 陈 羽 凡 是 一 对 羡 煞 旁 人 的 恩 爱 夫 妻 。 此 前 ， 羽 凡 凭 着 优 秀 的 音 乐 拿 到 了 某 音 乐 节 上 的 歌 王 ， 然 后 白 百 合 这 边 便 随 着 《 失 恋 3 3 天 》 火 了 。 从 此 ， 百 合 的 星 途 就 是 一 路 绿 灯 ， 然 后 凭 借 着 一 张 更 具 有 灵 气 的 脸 蛋 抢 了 王 <unk> 丹 的 地 位 ， 后 来 更 是 随 着 《 捉 妖 记 》 名 气 大 增 ， 成 为 内 地 一 线 女 演 员 。\n",
      "1000\n",
      "25000\n"
     ]
    }
   ],
   "source": [
    "# Reserved vocabulary ids (same values as in build_idx_for_chars_tf_df).\n",
    "id_beg = 0\n",
    "id_eos = 1\n",
    "id_emp = 2\n",
    "id_unk = 3\n",
    "\n",
    "total_samples = 26000\n",
    "val_samples = 1000\n",
    "train_samples = total_samples - val_samples\n",
    "\n",
    "\n",
    "DataFile = \"data/basic_data_tf.pkl\"\n",
    "UseStoredData = True\n",
    "\n",
    "if UseStoredData:\n",
    "    print(\"use the stored data\")\n",
    "    # NOTE(review): unpickling can execute arbitrary code; only load a\n",
    "    # DataFile that this notebook wrote itself.\n",
    "    # Binary mode matches the binary protocol used when dumping below.\n",
    "    with open(DataFile, \"rb\") as data_in:\n",
    "        char2idx, idx2char, X_train, X_test, Y_train, Y_test = pickle.load(data_in)\n",
    "else:\n",
    "    # sklearn is only needed when the dataset is rebuilt, so import it lazily.\n",
    "    from sklearn.model_selection import train_test_split\n",
    "\n",
    "    r = redis.StrictRedis(host='localhost', port=6379, db=0)\n",
    "    corpus = parse_all_crawled_data()\n",
    "    chars_dict = cal_char_tf_df(corpus)\n",
    "\n",
    "    print(\"Got [%s] Uniq charaters\" % len(chars_dict))\n",
    "    chars_tf_reverse = sorted(chars_dict.values(), key=lambda x: x.tf, reverse=True)\n",
    "    print(\"the Top 10 are:\")\n",
    "    print(\"\\n\".join([\"%s\\t%s\\t%s\" % (char.val, char.tf, char.df) for char in chars_tf_reverse[:10]]))\n",
    "\n",
    "    char2idx, idx2char = build_idx_for_chars_tf_df(chars_dict.values())\n",
    "    # The vocabulary was counted on lower-cased text, so the text is lower-cased\n",
    "    # before encoding too; otherwise every upper-case letter becomes <unk>.\n",
    "    # Slice first so only the documents that are kept get encoded.\n",
    "    sampled_docs = corpus[:total_samples]\n",
    "    titles = [[char2idx.get(char, id_unk) for char in doc[1].lower()] for doc in sampled_docs]\n",
    "    abstracts = [[char2idx.get(char, id_unk) for char in doc[2].lower()] for doc in sampled_docs]\n",
    "\n",
    "    X_train, X_test, Y_train, Y_test = train_test_split(abstracts, titles, test_size=val_samples, random_state=10)\n",
    "\n",
    "    with open(DataFile, \"wb\") as data_out:\n",
    "        pickle.dump((char2idx, idx2char, X_train, X_test, Y_train, Y_test), data_out, -1)\n",
    "\n",
    "\n",
    "vocab_size = len(char2idx)\n",
    "print(\"vocabsize is :[%d]\" % vocab_size)\n",
    "# randrange excludes its upper bound; randint(0, len(X_train)) could pick an\n",
    "# index one past the end of the list.\n",
    "i = random.randrange(len(X_train))\n",
    "prt('H',Y_train[i])\n",
    "prt('D',X_train[i])\n",
    "\n",
    "print(len(X_test))\n",
    "print(len(X_train))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# --- optimisation ---\n",
    "learning_rate = 0.001\n",
    "batch_size = 32\n",
    "display_step = 10\n",
    "dropout_keep_prob = 1.0\n",
    "\n",
    "# --- sequence lengths ---\n",
    "maxlena = 150  # 0 - if we dont want to use description at all\n",
    "maxlent = 40\n",
    "maxlen = maxlena + maxlent\n",
    "maxlend = maxlena  # width of the encoder input placeholder\n",
    "maxlenh = maxlent  # width of the decoder input/target placeholders\n",
    "\n",
    "# --- vocabulary / embedding ---\n",
    "vocab_size = len(char2idx)\n",
    "embedding_size = 100\n",
    "\n",
    "# Short aliases for the reserved token ids.\n",
    "empty, eos, unk, beg = id_emp, id_eos, id_unk, id_beg\n",
    "\n",
    "# --- CNN encoder ---\n",
    "filter_sizes = [2, 3, 4, 5, 6, 8, 10, 13]\n",
    "num_filters = 16\n",
    "\n",
    "# --- RNN decoder: GRU cell memory size, same as the encoder state ---\n",
    "memory_dim = 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# int32 id matrices fed at run time; the first (batch) dimension is left open.\n",
    "encoder_inputs = tf.placeholder(tf.int32, shape=[None, maxlend], name='encoder_inputs')\n",
    "decoder_targets = tf.placeholder(tf.int32, shape=[None, maxlenh], name='decoder_targets')\n",
    "decoder_inputs = tf.placeholder(tf.int32, shape=[None, maxlenh], name='decoder_inputs')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Embedding table looked up by both the encoder and the decoder;\n",
    "# initialised from a uniform distribution over [-1, 1).\n",
    "embeddings = tf.Variable(\n",
    "    tf.random_uniform([vocab_size, embedding_size], minval=-1.0, maxval=1.0),\n",
    "    name=\"embeddings\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# CNN used as the encoder\n",
    "def CNNEncoder(encoder_inputs):\n",
    "    \"\"\"Encode a batch of id sequences with parallel convolution + max-pool branches.\n",
    "\n",
    "    One branch is built per entry of the module-level `filter_sizes`; each\n",
    "    branch convolves the embedded input with `num_filters` filters spanning\n",
    "    the full embedding width, applies ReLU and max-pools over the sequence\n",
    "    axis. The pooling window is derived from the module-level `maxlend`.\n",
    "\n",
    "    Args:\n",
    "        encoder_inputs: integer id tensor passed to the embedding lookup.\n",
    "\n",
    "    Returns:\n",
    "        Tensor reshaped to [-1, num_filters * len(filter_sizes)] after dropout\n",
    "        with keep probability `dropout_keep_prob`.\n",
    "    \"\"\"\n",
    "    embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)\n",
    "    # conv2d expects a trailing channel axis.\n",
    "    embedded_4d = tf.expand_dims(embedded, -1)\n",
    "\n",
    "    unit_strides = [1, 1, 1, 1]\n",
    "    pooled_outputs = []\n",
    "    for filter_size in filter_sizes:\n",
    "        with tf.name_scope(\"conv-maxpool-%s\" % filter_size):\n",
    "            # Convolution over `filter_size` consecutive positions.\n",
    "            W = tf.Variable(\n",
    "                tf.truncated_normal([filter_size, embedding_size, 1, num_filters], stddev=0.1),\n",
    "                name=\"W\")\n",
    "            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name=\"b\")\n",
    "            conv = tf.nn.conv2d(embedded_4d, W, strides=unit_strides, padding=\"VALID\", name=\"conv\")\n",
    "            activated = tf.nn.relu(tf.nn.bias_add(conv, b), name=\"relu\")\n",
    "            # The window covers every position a VALID convolution produces,\n",
    "            # leaving one value per filter.\n",
    "            pooled = tf.nn.max_pool(\n",
    "                activated,\n",
    "                ksize=[1, maxlend - filter_size + 1, 1, 1],\n",
    "                strides=unit_strides,\n",
    "                padding='VALID',\n",
    "                name=\"pool\")\n",
    "            pooled_outputs.append(pooled)\n",
    "\n",
    "    # Concatenate the branches on the filter axis and flatten per example.\n",
    "    num_filters_total = num_filters * len(filter_sizes)\n",
    "    h_pool_flat = tf.reshape(tf.concat(pooled_outputs, 3), [-1, num_filters_total])\n",
    "\n",
    "    with tf.name_scope(\"dropout\"):\n",
    "        return tf.nn.dropout(h_pool_flat, dropout_keep_prob, name=\"dropout\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RNN as decoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def RNNDecoder(encoder_state,decoder_inputs):\n",
    "    \"\"\"Run a GRU over the embedded decoder inputs, starting from `encoder_state`.\n",
    "\n",
    "    Args:\n",
    "        encoder_state: tensor used as the initial state of the GRU cell.\n",
    "        decoder_inputs: integer id tensor passed to the embedding lookup.\n",
    "\n",
    "    Returns:\n",
    "        The (outputs, final_state) pair produced by tf.nn.dynamic_rnn.\n",
    "    \"\"\"\n",
    "    embedded = tf.nn.embedding_lookup(embeddings, decoder_inputs)\n",
    "    gru_cell = rnn.GRUCell(memory_dim)\n",
    "    return tf.nn.dynamic_rnn(\n",
    "        gru_cell,\n",
    "        embedded,\n",
    "        initial_state=encoder_state,\n",
    "        dtype=tf.float32,\n",
    "        scope=\"plain_decoder1\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Wire the graph: CNN encoder state -> GRU decoder -> per-step vocabulary logits.\n",
    "encoder_state = CNNEncoder(encoder_inputs)\n",
    "decoder_outputs, _ = RNNDecoder(encoder_state,decoder_inputs)\n",
    "\n",
    "decoder_logits = tf.contrib.layers.linear(decoder_outputs, vocab_size)\n",
    "labels = tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32)\n",
    "stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(\n",
    "    labels = labels,\n",
    "    logits=decoder_logits,\n",
    ")\n",
    "\n",
    "# NOTE(review): the mean runs over every decoder position, padding included.\n",
    "loss = tf.reduce_mean(stepwise_cross_entropy,name = \"loss\")\n",
    "\n",
    "decoder_prediction = tf.argmax(decoder_logits, 2,name = \"decoder_prediction\")\n",
    "\n",
    "# Use the configured `learning_rate` instead of a second hard-coded copy, so\n",
    "# changing the parameter cell actually changes the optimizer.\n",
    "train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss,name = \"op_adam_minize\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Integer target ids recovered from the one-hot `labels` tensor.\n",
    "labels_ = tf.argmax(labels, axis=2, name=\"labels_\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Training:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def rpadd(x, maxlen=maxlenh, eos=eos,lpad=True,prefix=None):\n",
    "    \"\"\"Right-pad (or truncate) a list of token ids to exactly `maxlen` entries.\n",
    "\n",
    "    The result is the optional `prefix` followed by `x`, cut to at most\n",
    "    `maxlen - 1` tokens, then one `eos`, then the module-level `empty` id\n",
    "    repeated until the length is `maxlen`.\n",
    "\n",
    "    Args:\n",
    "        x: sequence of token ids; it is copied, never modified.\n",
    "        maxlen: length of the result; must be at least 1 to hold `eos`.\n",
    "        eos: end-of-sequence id appended after the kept tokens.\n",
    "        lpad: unused; kept only so existing calls remain valid.\n",
    "        prefix: optional token id placed before `x` (0 is a valid prefix,\n",
    "            hence the explicit comparison against None).\n",
    "\n",
    "    Returns:\n",
    "        A new list of length `maxlen`.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: if `maxlen` is smaller than 1.\n",
    "    \"\"\"\n",
    "    assert maxlen >= 1, \"maxlen must leave room for the eos token\"\n",
    "\n",
    "    tokens = list(x)\n",
    "    if prefix is not None:\n",
    "        tokens = [prefix] + tokens\n",
    "    # Keep at most maxlen - 1 tokens so that eos always fits.\n",
    "    tokens = tokens[:maxlen - 1]\n",
    "    res = tokens + [eos] + [empty] * (maxlen - len(tokens) - 1)\n",
    "    assert len(res) == maxlen\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Training samples and progress messages are written to files under log/.\n",
    "# `os`, `sys` and `logging` come from the import cell at the top.\n",
    "\n",
    "# Python 2 only: with a utf-8 default encoding, prt() below can call\n",
    "# .encode(\"utf-8\") on byte strings that already hold UTF-8 text.\n",
    "reload(sys)\n",
    "sys.setdefaultencoding(\"utf-8\")\n",
    "\n",
    "# Opening the log files fails if the directory is missing.\n",
    "if not os.path.isdir(\"log\"):\n",
    "    os.makedirs(\"log\")\n",
    "\n",
    "outfile = open(\"log/train.20170520.samples\",'w')\n",
    "\n",
    "def prt(label, x,):\n",
    "    \"\"\"Write `label:` and the characters of the index sequence `x` to `outfile`.\n",
    "\n",
    "    This definition replaces the console-printing `prt` defined earlier in\n",
    "    the notebook. Entries equal to `id_emp` are skipped; the line is flushed\n",
    "    after every call.\n",
    "    \"\"\"\n",
    "    outfile.write((label+':').encode(\"utf-8\"))\n",
    "    for w in x:\n",
    "        if w == id_emp:\n",
    "            continue\n",
    "        outfile.write(idx2char[w].encode(\"utf-8\"))\n",
    "    outfile.write(\"\\n\")\n",
    "    outfile.flush()\n",
    "\n",
    "logger = logging.getLogger('training')\n",
    "# getLogger returns the same object every time, so re-running this cell\n",
    "# would otherwise stack handlers and duplicate every log line.\n",
    "if not logger.handlers:\n",
    "    hdlr = logging.FileHandler('log/train.20170520.log')\n",
    "    logger.addHandler(hdlr)\n",
    "logger.setLevel(logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "RESTORE = False\n",
    "batch_size = 128\n",
    "epocs = 1500\n",
    "\n",
    "saver = tf.train.Saver()\n",
    "\n",
    "# Saver.save does not create the checkpoint directory itself.\n",
    "if not os.path.isdir(\"model2\"):\n",
    "    os.makedirs(\"model2\")\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    if RESTORE:\n",
    "        # The graph is already built in this notebook, so restore straight\n",
    "        # into it. Importing the meta graph again would add a second copy of\n",
    "        # every node and restore into that copy instead.\n",
    "        # NOTE(review): checkpoints are read from model/ but written to model2/.\n",
    "        checkpoint_path = tf.train.latest_checkpoint('model/')\n",
    "        if checkpoint_path is None:\n",
    "            raise IOError(\"no checkpoint found under model/\")\n",
    "        saver.restore(sess, checkpoint_path)\n",
    "    else:\n",
    "        # Initialise only when not restoring; running the initializer after a\n",
    "        # restore would overwrite the restored weights.\n",
    "        sess.run(tf.global_variables_initializer())\n",
    "\n",
    "    for i in range(epocs):\n",
    "        j = 0\n",
    "        while (j < len(X_train)):\n",
    "            # Pad the current slice; the decoder input starts with <beg> and\n",
    "            # the target is the same sequence shifted left by one position.\n",
    "            encoder_inputs_ = [rpadd(x, maxlend) for x in X_train[j:j+batch_size]]\n",
    "            decoder_inputs_ = [rpadd(x, maxlenh, prefix=beg) for x in Y_train[j:j+batch_size]]\n",
    "            decoder_targets_ = [x[1:] + [empty] for x in decoder_inputs_]\n",
    "\n",
    "            j = j + batch_size\n",
    "            _, loss_ = sess.run([train_op, loss],\n",
    "                feed_dict={\n",
    "                    encoder_inputs : encoder_inputs_,\n",
    "                    decoder_inputs : decoder_inputs_,\n",
    "                    decoder_targets : decoder_targets_\n",
    "            })\n",
    "\n",
    "            if j % (batch_size * 30) == 0:\n",
    "                logger.info(\"Runing in EPOC[%d] Batch [%d] with loss [%f]\" % (i, j // batch_size, loss_))\n",
    "                k = random.randint(0, len(encoder_inputs_) - 1)\n",
    "                print(\"-\" * 20)\n",
    "                prt(\"[**描  述**]\",encoder_inputs_[k])\n",
    "\n",
    "                # Greedy decoding: feed the tokens predicted so far and take\n",
    "                # the prediction at the next position, until <eos>.\n",
    "                test_x = []\n",
    "                test_encode_input = encoder_inputs_[k]\n",
    "                for l in range(maxlenh):\n",
    "                    new_decoder_input = rpadd(test_x, maxlenh, prefix=beg)\n",
    "                    step_prediction = sess.run(decoder_prediction,\n",
    "                        feed_dict={\n",
    "                            encoder_inputs : [test_encode_input],\n",
    "                            decoder_inputs : [new_decoder_input]\n",
    "                    })\n",
    "                    next_token = step_prediction[0][l]\n",
    "                    test_x.append(next_token)\n",
    "                    if next_token == eos:\n",
    "                        break\n",
    "                prt(\"[*预测标题*]\",test_x)\n",
    "                prt(\"[*真实标题*]\",decoder_inputs_[k])\n",
    "\n",
    "                print(\"-\" * 20)\n",
    "\n",
    "        if i % 10 == 0:\n",
    "            saver.save(sess, \"model2/TitleGeneration\", global_step=i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
